Trees and Forests


In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

Decision Tree Classification


In [2]:
from figures import plot_tree_interactive
plot_tree_interactive()


Random Forests


In [3]:
from figures import plot_forest_interactive
plot_forest_interactive()


Selecting the Optimal Estimator via Cross-Validation


In [4]:
from sklearn import grid_search
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier

digits = load_digits()
X, y = digits.data, digits.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf = RandomForestClassifier(n_estimators=200, n_jobs=-1)
parameters = {'max_features':['sqrt', 'log2'],
              'max_depth':[5, 7, 9]}

clf_grid = grid_search.GridSearchCV(rf, parameters)
clf_grid.fit(X_train, y_train)


Out[4]:
GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'max_features': ['sqrt', 'log2'], 'max_depth': [5, 7, 9]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None, scoring=None,
       verbose=0)

In [5]:
clf_grid.score(X_train, y_train)


Out[5]:
1.0

In [6]:
clf_grid.score(X_test, y_test)


Out[6]:
0.97111111111111115

In [ ]: